Open In Colab

Mounting Google Drive

In [1]:
#from google.colab import drive
#drive.mount('/content/drive/')

Importing Libraries

In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import time, os, sys, itertools, re 
from PIL import Image
import warnings, pickle, string
from dateutil import parser
%matplotlib inline

# Data Visualization
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

from ftfy import fix_text, badness

# Traditional Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Tools & Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, auc
from sklearn.metrics import roc_curve, accuracy_score, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Reading the data from excel

In [3]:
#data=pd.read_excel('/content/drive/MyDrive/Capstone/input_data.xlsx')
data=pd.read_excel('input_data.xlsx')
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Short description  8492 non-null   object
 1   Description        8499 non-null   object
 2   Caller             8500 non-null   object
 3   Assignment group   8500 non-null   object
dtypes: object(4)
memory usage: 265.8+ KB

Exploratory Data Analysis

In [4]:
data.head()
Out[4]:
Short description Description Caller Assignment group
0 login issue -verified user details.(employee# & manager na... spxjnwir pjlcoqds GRP_0
1 outlook \r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail... hmjdrvpb komuaywn GRP_0
2 cant log in to vpn \r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail... eylqgodm ybqkwiam GRP_0
3 unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error owlgqjme qhcozdfx GRP_0
In [5]:
assignment_group_count=data['Assignment group'].value_counts()
assignment_group_count.describe()
Out[5]:
count      74.000000
mean      114.864865
std       465.747516
min         1.000000
25%         5.250000
50%        26.000000
75%        84.000000
max      3976.000000
Name: Assignment group, dtype: float64
In [6]:
plt.subplots(figsize=(50,10))
ax=sns.countplot(x='Assignment group', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.tight_layout
plt.show()
In [7]:
assignment_group_count.head(50)
Out[7]:
GRP_0     3976
GRP_8      661
GRP_24     289
GRP_12     257
GRP_9      252
GRP_2      241
GRP_19     215
GRP_3      200
GRP_6      184
GRP_13     145
GRP_10     140
GRP_5      129
GRP_14     118
GRP_25     116
GRP_33     107
GRP_4      100
GRP_29      97
GRP_18      88
GRP_16      85
GRP_17      81
GRP_31      69
GRP_7       68
GRP_34      62
GRP_26      56
GRP_40      45
GRP_28      44
GRP_41      40
GRP_15      39
GRP_30      39
GRP_42      37
GRP_20      36
GRP_45      35
GRP_22      31
GRP_1       31
GRP_11      30
GRP_21      29
GRP_47      27
GRP_62      25
GRP_48      25
GRP_23      25
GRP_60      20
GRP_39      19
GRP_27      18
GRP_37      16
GRP_36      15
GRP_44      15
GRP_50      14
GRP_65      11
GRP_53      11
GRP_52       9
Name: Assignment group, dtype: int64
In [8]:
assignment_group_count.tail(24)
Out[8]:
GRP_51    8
GRP_55    8
GRP_46    6
GRP_49    6
GRP_59    6
GRP_43    5
GRP_66    4
GRP_32    4
GRP_56    3
GRP_58    3
GRP_68    3
GRP_38    3
GRP_63    3
GRP_57    2
GRP_54    2
GRP_71    2
GRP_69    2
GRP_72    2
GRP_67    1
GRP_61    1
GRP_73    1
GRP_64    1
GRP_35    1
GRP_70    1
Name: Assignment group, dtype: int64

Check Missing Values in dataframe

In [9]:
data.isnull().sum()
Out[9]:
Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64
In [10]:
data[data["Short description"].isnull()]
Out[10]:
Short description Description Caller Assignment group
2604 NaN \r\n\r\nreceived from: ohdrnswl.rezuibdt@gmail... ohdrnswl rezuibdt GRP_34
3383 NaN \r\n-connected to the user system using teamvi... qftpazns fxpnytmk GRP_0
3906 NaN -user unable tologin to vpn.\r\n-connected to... awpcmsey ctdiuqwe GRP_0
3910 NaN -user unable tologin to vpn.\r\n-connected to... rhwsmefo tvphyura GRP_0
3915 NaN -user unable tologin to vpn.\r\n-connected to... hxripljo efzounig GRP_0
3921 NaN -user unable tologin to vpn.\r\n-connected to... cziadygo veiosxby GRP_0
3924 NaN name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:mic... wvqgbdhm fwchqjor GRP_0
4341 NaN \r\n\r\nreceived from: eqmuniov.ehxkcbgj@gmail... eqmuniov ehxkcbgj GRP_0

Copy Short Description to Description if the Description value is NaN

In [11]:
data[data["Description"].isnull()]=data["Short description"]
In [12]:
data[data["Description"].isnull()]
Out[12]:
Short description Description Caller Assignment group
In [13]:
data['Short description'] = data['Short description'].replace(np.nan, '', regex=True)
In [14]:
data.isnull().sum()
Out[14]:
Short description    0
Description          0
Caller               0
Assignment group     0
dtype: int64

Create a rule based engine

In [15]:
#df_rules = pd.read_csv('/content/drive/MyDrive/Capstone/Rule_matrix.csv')
df_rules = pd.read_csv("Rule_matrix.csv")
In [16]:
def applyRules(datadf,rulesdf,Description,ShortDescription):
    datadf['pred_group'] = np.nan
    for i, row in rulesdf.iterrows():
        for j, row in datadf.iterrows():
            if pd.notna(datadf[ShortDescription][j]):
                if (('erp' in datadf[ShortDescription][j]) and (('EU_tool' in datadf[ShortDescription][j]))):
                        datadf['pred_group'][j] = 'GRP_25'
        for j, row in datadf.iterrows():
            if pd.notna(datadf[Description][j]):
                if (datadf[Description][j] == 'the'):
                        datadf['pred_group'][j] = 'GRP_17' 
                
                if (('finance_app' in datadf[ShortDescription][j]) and ('HostName_1132' not in datadf[ShortDescription][j])):
                    datadf['pred_group'][j] = 'GRP_55'
                
                if (('processor' in datadf[Description][j]) and ('engg' in datadf[Description][j])):
                    datadf['pred_group'][j] = 'GRP_58'
        
        if rulesdf['Short Desc Rule'][i] == 'begins with' and rulesdf['Desc Rule'][i] == 'begins with' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[ShortDescription][j]) and pd.notna(datadf[Description][j]):
                    if ((datadf[ShortDescription][j].startswith(rulesdf['Short Dec Keyword'][i])) and (datadf[Description][j].startswith(rulesdf['Dec keyword'][i]))):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'begins with' and pd.notna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]) and pd.notna(datadf['Caller'][j]):
                    if ((datadf[Description][j].startswith(rulesdf['Desc Rule'][i]) and (rulesdf['User'][i] == datadf['Caller'][j]))):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if rulesdf['Short Desc Rule'][i] == 'contains' and pd.notna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if (pd.notna(datadf[ShortDescription][j]) and pd.notna(datadf['Caller'][j])):
                     if ((rulesdf['Short Dec Keyword'][i] in datadf[ShortDescription][j]) and (rulesdf['User'][i] == datadf['Caller'][j])):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if rulesdf['Short Desc Rule'][i] == 'contains' and pd.isna(rulesdf['Desc Rule'][i]) and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                #print(j)
                if pd.notna(datadf[ShortDescription][j]):
                    if (rulesdf['Short Dec Keyword'][i] in datadf[ShortDescription][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'begins with' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (datadf[Description][j].startswith(rulesdf['Dec keyword'][i])):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'contains' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (rulesdf['Dec keyword'][i] in datadf[Description][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
       

    return datadf
In [17]:
rules_applied_df = applyRules(data,df_rules,'Description','Short description')
rules_applied_df
Out[17]:
Short description Description Caller Assignment group pred_group
0 login issue -verified user details.(employee# & manager na... spxjnwir pjlcoqds GRP_0 NaN
1 outlook \r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail... hmjdrvpb komuaywn GRP_0 NaN
2 cant log in to vpn \r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail... eylqgodm ybqkwiam GRP_0 NaN
3 unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0 NaN
4 skype error skype error owlgqjme qhcozdfx GRP_0 NaN
... ... ... ... ... ...
8495 emails not coming in from zz mail \r\n\r\nreceived from: avglmrts.vhqmtiua@gmail... avglmrts vhqmtiua GRP_29 NaN
8496 telephony_software issue telephony_software issue rbozivdq gmlhrtvp GRP_0 NaN
8497 vip2: windows password reset for tifpdchb pedx... vip2: windows password reset for tifpdchb pedx... oybwdsgx oxyhwrfz GRP_0 NaN
8498 machine não está funcionando i am unable to access the machine utilities to... ufawcgob aowhxjky GRP_62 NaN
8499 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc`s lassen sich verschiedene prgr... kqvbrspl jyzoklfx GRP_49 NaN

8500 rows × 5 columns

In [18]:
rules_applied_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Short description  8500 non-null   object
 1   Description        8500 non-null   object
 2   Caller             8500 non-null   object
 3   Assignment group   8500 non-null   object
 4   pred_group         296 non-null    object
dtypes: object(5)
memory usage: 332.2+ KB
In [19]:
rules_applied_df = rules_applied_df[(rules_applied_df['pred_group'].isna())]
rules_applied_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8204 entries, 0 to 8499
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Short description  8204 non-null   object
 1   Description        8204 non-null   object
 2   Caller             8204 non-null   object
 3   Assignment group   8204 non-null   object
 4   pred_group         0 non-null      object
dtypes: object(5)
memory usage: 384.6+ KB
In [20]:
assignment_group_count=rules_applied_df['Assignment group'].value_counts()
assignment_group_count.describe()
Out[20]:
count      63.000000
mean      130.222222
std       485.313382
min         1.000000
25%        11.000000
50%        31.000000
75%        98.500000
max      3834.000000
Name: Assignment group, dtype: float64

Concatenate Short Description and Description Column into New Description, drop the previous columns

In [21]:
#Concatenate Short Description and Description columns
rules_applied_df['New Description'] = rules_applied_df['Description'] + ' ' +rules_applied_df['Short description']

clean_data=rules_applied_df.drop(['Short description', 'Description', 'pred_group'], axis=1)
In [22]:
clean_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8204 entries, 0 to 8499
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Caller            8204 non-null   object
 1   Assignment group  8204 non-null   object
 2   New Description   8204 non-null   object
dtypes: object(3)
memory usage: 256.4+ KB

Fixing Garbled Text/ Mojibake using ftfy library

In [23]:
# Write a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
    if not badness.sequence_weirdness(text):
        # nothing weird, should be okay
        return True
    try:
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return True
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return False
# Check the dataset for mojibake impact
clean_data[~clean_data.iloc[:,:].applymap(is_mojibake_impacted).all(1)]
Out[23]:
Caller Assignment group New Description
99 ecprjbod litmjwsy GRP_0 \n\nreceived from: ecprjbod.litmjwsy@gmail.com...
116 bgqpotek cuxakvml GRP_0 \r\n\r\nreceived from: bgqpotek.cuxakvml@gmail...
124 tvcdfqgp nrbcqwgj GRP_0 from: tvcdfqgp nrbcqwgj \nsent: friday, octobe...
164 tycludks cjofwigv GRP_0 \n\nreceived from: abcdri@company.com\n\nwindy...
170 fbvpcytz nokypgvx GRP_18 \n\nreceived from: fbvpcytz.nokypgvx@gmail.com...
... ... ... ...
8470 azxhejvq fyemlavd GRP_16 from: mikhghytr wafglhdrhjop \nsent: thursday,...
8471 xqyjztnm onfusvlz GRP_30 to 小贺,早上电脑开机开不出来 电...
8480 nlearzwi ukdzstwi GRP_9 \r\n\r\nreceived from: nlearzwi.ukdzstwi@gmail...
8498 ufawcgob aowhxjky GRP_62 i am unable to access the machine utilities to...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr...

820 rows × 3 columns

In [24]:
# Take an example of row# 8471 Short Desc and fix it
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (clean_data['New Description'][8471], 
                                                                        fix_text(clean_data['New Description'][8471])))

# List all mojibakes defined in ftfy library
print('\nMojibake Symbol RegEx:\n', badness.MOJIBAKE_SYMBOL_RE.pattern)
Grabled text: to 小贺,早上电脑开机开不出来 电脑开机开不出来
Fixed text: to 小贺,早上电脑开机开不出来 电脑开机开不出来

Mojibake Symbol RegEx:
 [ÂÃÎÏÐÑØÙĂĎĐŃŘŮ][€-Ÿ€ƒ‚„†‡ˆ‰‹Œ“•˜œŸ¡¢£¤¥¦§¨ª«¬¯°±²³µ¶·¸¹º¼½¾¿ˇ˘˝]|[ÂÃÎÏÐÑØÙĂĎĐŃŘŮ][›»‘”´©™]\w|×[€-Ÿƒ‚„†‡ˆ‰‹Œ“•˜œŸ¡¦§¨ª«¬¯°²³ˇ˘›‘”´©™]|[¬√][ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñúùûü†¢£§¶ß®©™≠ÆØ¥ªæø≤≥]|\w√[±∂]\w|◊|[ðđ][ŸŸ]|â€|вЂ[љћ¦°№™ќ“”]
In [25]:
# Sanitize the dataset from Mojibakes
clean_data['New Description'] = clean_data['New Description'].apply(fix_text)

# Visualize that row# 8471
clean_data.loc[8471]
Out[25]:
Caller                      xqyjztnm onfusvlz
Assignment group                       GRP_30
New Description     to 小贺,早上电脑开机开不出来 电脑开机开不出来
Name: 8471, dtype: object

Cleaning & Processing the data

In [26]:
def date_validity(date_str):
    try:
        parser.parse(date_str)
        return True
    except:
        return False
In [27]:
def process(text_string):
    text=text_string.lower()
    text_string = ' '.join([w for w in text_string.split() if not date_validity(w)])
    text_string = re.sub(r"received from:",'',text_string)
    text_string = re.sub(r"from:",' ',text_string)
    text_string = re.sub(r"to:",' ',text_string)
    text_string = re.sub(r"subject:",' ',text_string)
    text_string = re.sub(r"sent:",' ',text_string)
    text_string = re.sub(r"ic:",' ',text_string)
    text_string = re.sub(r"cc:",' ',text_string)
    text_string = re.sub(r"bcc:",' ',text_string)
    text_string = re.sub(r'\S*@\S*\s?', '', text_string)
    text_string = re.sub(r'\d+','' ,text_string)
    text_string = re.sub(r'\n',' ',text_string)
    text_string = re.sub(r'#','', text_string)
    text_string = re.sub(r'&;?', 'and',text_string)
    text_string = re.sub(r'\&\w*;', '', text_string)
    text_string = re.sub(r'https?:\/\/.*\/\w*', '', text_string)  
    text_string= ''.join(c for c in text_string if c <= '\uFFFF') 
    text_string = text_string.strip()
    text_string = ' '.join(re.sub("[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text_string).split())
    text_string = re.sub(r"\s+[a-zA-Z]\s+", ' ', text_string)
    text_string = re.sub(' +', ' ', text_string)
    text_string = text_string.strip()
    return text_string
  
In [28]:
clean_data["Clean_Description"] = clean_data["New Description"].apply(process)
In [29]:
clean_data
Out[29]:
Caller Assignment group New Description Clean_Description
0 spxjnwir pjlcoqds GRP_0 -verified user details.(employee# & manager na... verified user details employee and manager nam...
1 hmjdrvpb komuaywn GRP_0 \n\nreceived from: hmjdrvpb.komuaywn@gmail.com... hello team my meetings skype meetings etc are ...
2 eylqgodm ybqkwiam GRP_0 \n\nreceived from: eylqgodm.ybqkwiam@gmail.com... hi cannot log on to vpn best cant log in to vpn
3 xbkucsvz gcpydteq GRP_0 unable to access hr_tool page unable to access... unable to access hr tool page unable to access...
4 owlgqjme qhcozdfx GRP_0 skype error skype error skype error skype error
... ... ... ... ...
8495 avglmrts vhqmtiua GRP_29 \n\nreceived from: avglmrts.vhqmtiua@gmail.com... good afternoon am not receiving the emails tha...
8496 rbozivdq gmlhrtvp GRP_0 telephony_software issue telephony_software issue telephony software issue telephony software issue
8497 oybwdsgx oxyhwrfz GRP_0 vip2: windows password reset for tifpdchb pedx... vip windows password reset for tifpdchb pedxru...
8498 ufawcgob aowhxjky GRP_62 i am unable to access the machine utilities to... i am unable to access the machine utilities to...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc lassen sich verschiedene prgram...

8204 rows × 4 columns

Language Translation

In [30]:
!pip install langdetect
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple
Requirement already satisfied: langdetect in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (1.0.8)
Requirement already satisfied: six in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from langdetect) (1.15.0)
In [31]:
from langdetect import detect
    
def fn_lang_detect(df):                                        
   try:                                                          
      return detect(df)                                      
   except:                                                       
      return 'no'                                                  

clean_data['language'] = clean_data['Clean_Description'].apply(fn_lang_detect)
In [32]:
x = clean_data["language"].value_counts()
x=x.sort_index()
plt.figure(figsize=(10,6))
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Distribution of text by language")
plt.ylabel('number of records')
plt.xlabel('Language')
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show();
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\seaborn\_decorators.py:43: FutureWarning:

Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

We can see that most of the tickets are in english, followed by tickets in German language. We need to translate these into english.

In [33]:
#german_data = pd.read_csv("/content/drive/MyDrive/Capstone/german.csv")
german_data = pd.read_csv('german.csv')
In [34]:
german_data
Out[34]:
German English
0 ich I
1 sie you
2 das the
3 ist is
4 du you
... ... ...
10392 vereinigung Union
10393 mittelpunkt Focus
10394 page page
10395 andeuten imply
10396 helene helene

10397 rows × 2 columns

In [35]:
german_dictionary = german_data.to_dict(orient='records')
In [36]:
def translate_function(text):
    translated_text = []
    text_split = text.split()
    for text in text_split:
        word_found = False
        for item in range(len(german_dictionary)):
            if text == german_dictionary[item]["German"]:
                translated_text.append(german_dictionary[item]["English"])
                word_found = True
        if word_found == False:
            translated_text.append(text)
    translate = ' '.join([word for word in translated_text])        
    return translate

clean_data["Translated Text"] = clean_data["Clean_Description"].apply(translate_function)
In [37]:
clean_data.tail(10)
Out[37]:
Caller Assignment group New Description Clean_Description language Translated Text
8490 mpihysnw wrctgoan GRP_29 please contact ed pasgryowski (pasgryo) about ... please contact ed pasgryowski pasgryo about hi... en please contact ed pasgryowski pasgryo about hi...
8491 jxgobwrm qkugdipo GRP_34 \n\nreceived from: jxgobwrm.qkugdipo@gmail.com... i need vpn for my new laptop name llv knethyen... en i need vpn for my new Laptop Surname llv kneth...
8492 tmopbken ibzougsd GRP_0 hr_tool etime option not visitble hr_tool eti... hr tool etime option not visitble hr tool etim... en hr tool etime option distress visitble hr tool...
8493 ipwjorsc uboapexr GRP_10 i am sorry, i have another two accounts that n... i am sorry have another two accounts that need... en i at the sorry have another two accounts did n...
8494 cpmaidhj elbaqmtp GRP_3 tablet needs reimaged due to multiple issues w... tablet needs reimaged due to multiple issues w... en tablet needs reimaged due to multiple issues w...
8495 avglmrts vhqmtiua GRP_29 \n\nreceived from: avglmrts.vhqmtiua@gmail.com... good afternoon am not receiving the emails tha... en good afternoon at the distress receiving the e...
8496 rbozivdq gmlhrtvp GRP_0 telephony_software issue telephony_software issue telephony software issue telephony software issue en telephony software issue telephony software issue
8497 oybwdsgx oxyhwrfz GRP_0 vip2: windows password reset for tifpdchb pedx... vip windows password reset for tifpdchb pedxru... en vip windows password reset for tifpdchb pedxru...
8498 ufawcgob aowhxjky GRP_62 i am unable to access the machine utilities to... i am unable to access the machine utilities to... en i at the unable to access the machine utilitie...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc lassen sich verschiedene prgram... de at more pc to let themselves various prgramdnt...
In [38]:
clean_data[clean_data.language == 'de']
Out[38]:
Caller Assignment group New Description Clean_Description language Translated Text
133 bihypqsn kbaegpcd GRP_0 install driver in printer hr14 in HostName_769... install driver in printer hr in HostName insta... de install driver in printer hr in HostName insta...
223 vrfpyjwi nzhvgqiw GRP_24 hallo ,\n\nes ist erneut passiert. der pc hat ... hallo es ist erneut passiert der pc hat sich z... de Hello it is again happens the pc Has themselve...
265 fcyuqvoj ajqeidlm GRP_0 \n\nreceived from: fcyuqvoj.ajqeidlm@gmail.com... hallo netweaver funktioniert nicht mehr bzw ka... de Hello netweaver works Not more bzw can I Not m...
272 lpfwkotn keycvxsl GRP_0 drucker / scanner em85678\n \n scanner findet ... drucker scanner em scanner findet pfad nicht m... de drucker scanner em scanner finds path Not more...
304 wrcktgbd wzrgyunp GRP_24 alte eq abholen \wrcktgbd wzrgyunp alte eq abh... alte eq abholen wrcktgbd wzrgyunp alte eq abho... de old eq pick up wrcktgbd wzrgyunp old eq pick u...
... ... ... ... ... ... ...
8425 wfbkucds qaxhbois GRP_0 ich weiß mein erp passwort nicht mehr und habe... ich wei mein erp passwort nicht mehr und habe ... de I wei my erp password Not more and have fehlve...
8432 ZkBogxib QsEJzdZO GRP_8 received from: monitoring_tool@company.com\n\n... abended job in job scheduler Job at abended jo... de abended job in job scheduler Job at abended jo...
8439 kiqrvwat gwkpxzyt GRP_33 der drucker steht am platz von wckrxovs aunsgz... der drucker steht am platz von wckrxovs aunsgz... de the drucker stands at the space from wckrxovs ...
8448 ZkBogxib QsEJzdZO GRP_8 received from: monitoring_tool@company.com\n\n... abended job in job scheduler Job at abended jo... de abended job in job scheduler Job at abended jo...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc lassen sich verschiedene prgram... de at more pc to let themselves various prgramdnt...

404 rows × 6 columns

Data Augmentation

In [39]:
!pip3 install nltk
import nltk 
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple
Requirement already satisfied: nltk in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (3.4.4)
Requirement already satisfied: six in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from nltk) (1.15.0)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [40]:
from collections import OrderedDict
from nltk.tokenize import word_tokenize
def find_synonyms(word):
  synonyms = []
  for synset in wordnet.synsets(word):
    for syn in synset.lemma_names():
      synonyms.append(syn)

  # using this to drop duplicates while maintaining word order (closest synonyms comes first)
  synonyms_without_duplicates = list(OrderedDict.fromkeys(synonyms))
  return synonyms_without_duplicates
In [41]:
def create_set_of_new_sentences(sentence, max_syn_per_word = 3):
  new_sentences = []
  for word in word_tokenize(sentence):
    if len(word)<=3 : continue 
    for synonym in find_synonyms(word)[0:max_syn_per_word]:
      synonym = synonym.replace('_', ' ') #restore space character
      new_sentence = sentence.replace(word,synonym)
      new_sentences.append(new_sentence)
  return new_sentences
In [42]:
med_records=['GRP_8','GRP_3','GRP_12','GRP_2','GRP_13','GRP_19']

low_records=['GRP_24','GRP_9','GRP_6','GRP_10','GRP_5','GRP_14','GRP_25','GRP_33','GRP_4','GRP_29','GRP_18','GRP_16','GRP_17','GRP_31','GRP_7','GRP_34','GRP_26','GRP_40','GRP_28','GRP_41'
,'GRP_15','GRP_30','GRP_42','GRP_20','GRP_45','GRP_22','GRP_1','GRP_11']

vlow_records =['GRP_21','GRP_47','GRP_23','GRP_62','GRP_48','GRP_60','GRP_39','GRP_27','GRP_37','GRP_44','GRP_36','GRP_50','GRP_53','GRP_65','GRP_53','GRP_52','GRP_55','GRP_51','GRP_59','GRP_49','GRP_46','GRP_43','GRP_66','GRP_32','GRP_63','GRP_58','GRP_56','GRP_38','GRP_68','GRP_69','GRP_57','GRP_72','GRP_71','GRP_54','GRP_35','GRP_64','GRP_70','GRP_61','GRP_67','GRP_73']

clean_data1 = clean_data[clean_data["Assignment group"].isin(med_records)]
clean_data2 = clean_data[clean_data["Assignment group"].isin(low_records)]
clean_data3 = clean_data[clean_data["Assignment group"] .isin(vlow_records)]

clean_data4 = clean_data[clean_data["Assignment group"] == 'GRP_0']
In [43]:
clean_data1
Out[43]:
Caller Assignment group New Description Clean_Description language Translated Text
17 sigfdwcj reofwzlm GRP_3 when undocking pc , screen will not come back ... when undocking pc screen will not come back wh... en When undocking pc screen want distress come ba...
50 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job mm zscr dly merktc failed in job scheduler... en job mm zscr dly merktc failed in job scheduler...
59 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job mm zscr dly merktc failed in job scheduler... en job mm zscr dly merktc failed in job scheduler...
60 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job Job failed in job scheduler at job Job fai... en job Job failed in job scheduler at job Job fai...
64 utyeofsk rdyzpwhi GRP_8 apac company : two switches are down since 3.4... apac company two switches are down since am et... en apac company two switches are down since at th...
... ... ... ... ... ... ...
8466 ZkBogxib QsEJzdZO GRP_8 received from: monitoring_tool@company.com\n\n... abended job in job scheduler bkwin HostName in... en abended job in job scheduler bkwin HostName in...
8482 rkupnshb gsmzfojw GRP_8 robot HostName_776 is inactive robot HostName_... robot HostName is inactive robot HostName is i... en robot HostName is inactive robot HostName is i...
8484 hugcadrn ixhlwdgt GRP_2 please remove user hugcadrn ixhlwdgt (ralfteim... please remove user hugcadrn ixhlwdgt ralfteimp... en please remove user hugcadrn ixhlwdgt ralfteimp...
8487 pvbomqht smfkuhwi GRP_3 pc received multiple windows security updates ... pc received multiple windows security updates ... en pc received multiple windows security updates ...
8494 cpmaidhj elbaqmtp GRP_3 tablet needs reimaged due to multiple issues w... tablet needs reimaged due to multiple issues w... en tablet needs reimaged due to multiple issues w...

1666 rows × 6 columns

In [44]:
clean_data2
Out[44]:
Caller Assignment group New Description Clean_Description language Translated Text
6 jyoqwxhz clhxsoqy GRP_1 event: critical:HostName_221.company.com the v... event critical HostName company com the value ... en event critical HostName company com the value ...
32 kxsceyzo naokumlb GRP_4 \n\nreceived from: kxsceyzo.naokumlb@gmail.com... gentles have two devices that are trying to sh... en gentles have two devices did are trying to sha...
43 yisohglr uvteflgb GRP_5 \n\nreceived from: yisohglr.uvteflgb@gmail.com... hi the printer printer is not working and need... en Hi the printer printer is distress working and...
47 bpctwhsn kzqsbmtp GRP_6 received from: monitoring_tool@company.com\n\n... job Job failed in job scheduler at job Job fai... en job Job failed in job scheduler at job Job fai...
49 aofnvyzt eqiyskhm GRP_7 when closing a call, the agent keeps on the "o... when closing call the agent keeps on the on ac... en When closing call the agent keeps on the on ac...
... ... ... ... ... ... ...
8481 eagvusbr nguqityl GRP_9 \n\nreceived from: eagvusbr.nguqityl@gmail.com... hi team was going into the ess file and checki... en Hi team What going into the ess file and check...
8490 mpihysnw wrctgoan GRP_29 please contact ed pasgryowski (pasgryo) about ... please contact ed pasgryowski pasgryo about hi... en please contact ed pasgryowski pasgryo about hi...
8491 jxgobwrm qkugdipo GRP_34 \n\nreceived from: jxgobwrm.qkugdipo@gmail.com... i need vpn for my new laptop name llv knethyen... en i need vpn for my new Laptop Surname llv kneth...
8493 ipwjorsc uboapexr GRP_10 i am sorry, i have another two accounts that n... i am sorry have another two accounts that need... en i at the sorry have another two accounts did n...
8495 avglmrts vhqmtiua GRP_29 \n\nreceived from: avglmrts.vhqmtiua@gmail.com... good afternoon am not receiving the emails tha... en good afternoon at the distress receiving the e...

2388 rows × 6 columns

In [45]:
clean_data3
Out[45]:
Caller Assignment group New Description Clean_Description language Translated Text
197 uyrpdvoq mbzevtcx GRP_21 i need to approve the new product requests bel... i need to approve the new product requests bel... en i need to approve the new product requests bel...
206 ajgnibkx zixmcjgu GRP_23 unable to see the current course in ethics\n\n... unable to see the current course in ethics use... en unable to lake the current course in ethics us...
247 tphbruoq xtukhnym GRP_27 ic: welcome, our next available agent will be ... welcome our next available agent will be with ... en welcome our next available agent want be with ...
250 obanjrhg rnafleys GRP_27 name:obanjrhg rnafleys\nlanguage:\nbrowser:mic... name obanjrhg rnafleys language browser micros... en Surname obanjrhg rnafleys language browser mic...
298 stdezpqw bkmeuhfz GRP_21 internal users are unable to download discount... internal users are unable to download discount... en internal users are unable to download discount...
... ... ... ... ... ... ...
8377 crkdjbot qiztrxne GRP_23 please turn off eligibility for ethics for use... please turn off eligibility for ethics for use... en please turn off eligibility for ethics for use...
8387 fumkcsji sarmtlhy GRP_72 ticket (ticket_no1538972) update to anftgup nf... ticket ticket no update to anftgup nftgyair ti... en ticket ticket no update to anftgup nftgyair ti...
8396 zuyimtsf qjtimdsp GRP_62 formatar micro formatar micro formatar micro formatar micro ro formatar micro formatar micro
8498 ufawcgob aowhxjky GRP_62 i am unable to access the machine utilities to... i am unable to access the machine utilities to... en i at the unable to access the machine utilitie...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc lassen sich verschiedene prgram... de at more pc to let themselves various prgramdnt...

315 rows × 6 columns

In [46]:
clean_data4
Out[46]:
Caller Assignment group New Description Clean_Description language Translated Text
0 spxjnwir pjlcoqds GRP_0 -verified user details.(employee# & manager na... verified user details employee and manager nam... en verified user details employee and Manager Sur...
1 hmjdrvpb komuaywn GRP_0 \n\nreceived from: hmjdrvpb.komuaywn@gmail.com... hello team my meetings skype meetings etc are ... en hello team my meetings skype meetings etc are ...
2 eylqgodm ybqkwiam GRP_0 \n\nreceived from: eylqgodm.ybqkwiam@gmail.com... hi cannot log on to vpn best cant log in to vpn en Hi cannot log on to vpn best cant log in to vpn
3 xbkucsvz gcpydteq GRP_0 unable to access hr_tool page unable to access... unable to access hr tool page unable to access... en unable to access hr tool page unable to access...
4 owlgqjme qhcozdfx GRP_0 skype error skype error skype error skype error no skype error skype error
... ... ... ... ... ... ...
8488 rbozivdq gmlhrtvp GRP_0 name:mfeyouli ndobtzpw\nlanguage:\nbrowser:mic... name mfeyouli ndobtzpw language browser micros... en Surname mfeyouli ndobtzpw language browser mic...
8489 sdvlxbfe ptnahjkw GRP_0 account locked account locked account locked account locked en account locked account locked
8492 tmopbken ibzougsd GRP_0 hr_tool etime option not visitble hr_tool eti... hr tool etime option not visitble hr tool etim... en hr tool etime option distress visitble hr tool...
8496 rbozivdq gmlhrtvp GRP_0 telephony_software issue telephony_software issue telephony software issue telephony software issue en telephony software issue telephony software issue
8497 oybwdsgx oxyhwrfz GRP_0 vip2: windows password reset for tifpdchb pedx... vip windows password reset for tifpdchb pedxru... en vip windows password reset for tifpdchb pedxru...

3834 rows × 6 columns

In [47]:
maxsyn=1
#clean_data1["Augmented_data"] = clean_data1["Translated Text"].apply(create_set_of_new_sentences)
clean_data1["Augmented_data"] = clean_data1.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)

clean_data1
Out[47]:
Caller Assignment group New Description Clean_Description language Translated Text Augmented_data
17 sigfdwcj reofwzlm GRP_3 when undocking pc , screen will not come back ... when undocking pc screen will not come back wh... en When undocking pc screen want distress come ba... [When undock pc screen want distress come back...
50 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job mm zscr dly merktc failed in job scheduler... en job mm zscr dly merktc failed in job scheduler... [job mm zscr dly merktc fail in job scheduler ...
59 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job mm zscr dly merktc failed in job scheduler... en job mm zscr dly merktc failed in job scheduler... [job mm zscr dly merktc fail in job scheduler ...
60 bpctwhsn kzqsbmtp GRP_8 received from: monitoring_tool@company.com\n\n... job Job failed in job scheduler at job Job fai... en job Job failed in job scheduler at job Job fai... [job Job fail in job scheduler at job Job fail...
64 utyeofsk rdyzpwhi GRP_8 apac company : two switches are down since 3.4... apac company two switches are down since am et... en apac company two switches are down since at th... [apac company two switches are down since at t...
... ... ... ... ... ... ... ...
8466 ZkBogxib QsEJzdZO GRP_8 received from: monitoring_tool@company.com\n\n... abended job in job scheduler bkwin HostName in... en abended job in job scheduler bkwin HostName in... [abended job in job scheduler bkwin HostName i...
8482 rkupnshb gsmzfojw GRP_8 robot HostName_776 is inactive robot HostName_... robot HostName is inactive robot HostName is i... en robot HostName is inactive robot HostName is i... [automaton HostName is inactive automaton Host...
8484 hugcadrn ixhlwdgt GRP_2 please remove user hugcadrn ixhlwdgt (ralfteim... please remove user hugcadrn ixhlwdgt ralfteimp... en please remove user hugcadrn ixhlwdgt ralfteimp... [please remove user hugcadrn ixhlwdgt ralfteim...
8487 pvbomqht smfkuhwi GRP_3 pc received multiple windows security updates ... pc received multiple windows security updates ... en pc received multiple windows security updates ... [pc receive multiple windows security updates ...
8494 cpmaidhj elbaqmtp GRP_3 tablet needs reimaged due to multiple issues w... tablet needs reimaged due to multiple issues w... en tablet needs reimaged due to multiple issues w... [tablet needs reimaged due to multiple issues ...

1666 rows × 7 columns

In [48]:
s = clean_data1.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug1 = clean_data1.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.

In [49]:
init_notebook_mode()
cf.go_offline()

# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug1['Assignment group'].nunique())

# Histogram
clean_data_aug1['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')
Total assignment groups: 6
In [50]:
maxsyn=6
clean_data2["Augmented_data"] = clean_data2.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)

clean_data2
Out[50]:
Caller Assignment group New Description Clean_Description language Translated Text Augmented_data
6 jyoqwxhz clhxsoqy GRP_1 event: critical:HostName_221.company.com the v... event critical HostName company com the value ... en event critical HostName company com the value ... [event critical HostName company com the value...
32 kxsceyzo naokumlb GRP_4 \n\nreceived from: kxsceyzo.naokumlb@gmail.com... gentles have two devices that are trying to sh... en gentles have two devices did are trying to sha... [pacify have two devices did are trying to sha...
43 yisohglr uvteflgb GRP_5 \n\nreceived from: yisohglr.uvteflgb@gmail.com... hi the printer printer is not working and need... en Hi the printer printer is distress working and... [Hi the printer printer is distress working an...
47 bpctwhsn kzqsbmtp GRP_6 received from: monitoring_tool@company.com\n\n... job Job failed in job scheduler at job Job fai... en job Job failed in job scheduler at job Job fai... [job Job fail in job scheduler at job Job fail...
49 aofnvyzt eqiyskhm GRP_7 when closing a call, the agent keeps on the "o... when closing call the agent keeps on the on ac... en When closing call the agent keeps on the on ac... [When shutting call the agent keeps on the on ...
... ... ... ... ... ... ... ...
8481 eagvusbr nguqityl GRP_9 \n\nreceived from: eagvusbr.nguqityl@gmail.com... hi team was going into the ess file and checki... en Hi team What going into the ess file and check... [Hi team What going into the ess file and chec...
8490 mpihysnw wrctgoan GRP_29 please contact ed pasgryowski (pasgryo) about ... please contact ed pasgryowski pasgryo about hi... en please contact ed pasgryowski pasgryo about hi... [please contact ed pasgryowski pasgryo about h...
8491 jxgobwrm qkugdipo GRP_34 \n\nreceived from: jxgobwrm.qkugdipo@gmail.com... i need vpn for my new laptop name llv knethyen... en i need vpn for my new Laptop Surname llv kneth... [i need vpn for my new Laptop Surname llv knet...
8493 ipwjorsc uboapexr GRP_10 i am sorry, i have another two accounts that n... i am sorry have another two accounts that need... en i at the sorry have another two accounts did n... [i at the regretful have another two accounts ...
8495 avglmrts vhqmtiua GRP_29 \n\nreceived from: avglmrts.vhqmtiua@gmail.com... good afternoon am not receiving the emails tha... en good afternoon at the distress receiving the e... [good afternoon at the distress receiving the ...

2388 rows × 7 columns

In [51]:
s = clean_data2.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug2 = clean_data2.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.

In [52]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug2['Assignment group'].nunique())

# Histogram
clean_data_aug2['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-2)')
Total assignment groups: 28
In [53]:
maxsyn=10
clean_data3["Augmented_data"] = clean_data3.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)

clean_data3
Out[53]:
Caller Assignment group New Description Clean_Description language Translated Text Augmented_data
197 uyrpdvoq mbzevtcx GRP_21 i need to approve the new product requests bel... i need to approve the new product requests bel... en i need to approve the new product requests bel... [i need to approve the new product requests be...
206 ajgnibkx zixmcjgu GRP_23 unable to see the current course in ethics\n\n... unable to see the current course in ethics use... en unable to lake the current course in ethics us... [unable to lake the current course in ethics u...
247 tphbruoq xtukhnym GRP_27 ic: welcome, our next available agent will be ... welcome our next available agent will be with ... en welcome our next available agent want be with ... [welcome our next available agent want be with...
250 obanjrhg rnafleys GRP_27 name:obanjrhg rnafleys\nlanguage:\nbrowser:mic... name obanjrhg rnafleys language browser micros... en Surname obanjrhg rnafleys language browser mic... [surname obanjrhg rnafleys language browser mi...
298 stdezpqw bkmeuhfz GRP_21 internal users are unable to download discount... internal users are unable to download discount... en internal users are unable to download discount... [internal users are unable to download discoun...
... ... ... ... ... ... ... ...
8377 crkdjbot qiztrxne GRP_23 please turn off eligibility for ethics for use... please turn off eligibility for ethics for use... en please turn off eligibility for ethics for use... [please turn off eligibility for ethics for us...
8387 fumkcsji sarmtlhy GRP_72 ticket (ticket_no1538972) update to anftgup nf... ticket ticket no update to anftgup nftgyair ti... en ticket ticket no update to anftgup nftgyair ti... [ticket ticket no update to anftgup nftgyair t...
8396 zuyimtsf qjtimdsp GRP_62 formatar micro formatar micro formatar micro formatar micro ro formatar micro formatar micro [formatar micro formatar micro, formatar micro...
8498 ufawcgob aowhxjky GRP_62 i am unable to access the machine utilities to... i am unable to access the machine utilities to... en i at the unable to access the machine utilitie... [i at the unable to access the machine utiliti...
8499 kqvbrspl jyzoklfx GRP_49 an mehreren pc`s lassen sich verschiedene prgr... an mehreren pc lassen sich verschiedene prgram... de at more pc to let themselves various prgramdnt... [at More pc to let themselves various prgramdn...

315 rows × 7 columns

In [54]:
s = clean_data3.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug3 = clean_data3.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.

In [55]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug3['Assignment group'].nunique())

# Histogram
clean_data_aug3['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-4)')
Total assignment groups: 27
In [56]:
maxsyn=1
clean_data4["Augmented_data"] = clean_data4.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)

clean_data4
Out[56]:
Caller Assignment group New Description Clean_Description language Translated Text Augmented_data
0 spxjnwir pjlcoqds GRP_0 -verified user details.(employee# & manager na... verified user details employee and manager nam... en verified user details employee and Manager Sur... [verify user details employee and Manager Surn...
1 hmjdrvpb komuaywn GRP_0 \n\nreceived from: hmjdrvpb.komuaywn@gmail.com... hello team my meetings skype meetings etc are ... en hello team my meetings skype meetings etc are ... [hello team my meetings skype meetings etc are...
2 eylqgodm ybqkwiam GRP_0 \n\nreceived from: eylqgodm.ybqkwiam@gmail.com... hi cannot log on to vpn best cant log in to vpn en Hi cannot log on to vpn best cant log in to vpn [Hi cannot log on to vpn best cant log in to v...
3 xbkucsvz gcpydteq GRP_0 unable to access hr_tool page unable to access... unable to access hr tool page unable to access... en unable to access hr tool page unable to access... [unable to access hr tool page unable to acces...
4 owlgqjme qhcozdfx GRP_0 skype error skype error skype error skype error no skype error skype error [skype mistake skype mistake, skype mistake sk...
... ... ... ... ... ... ... ...
8488 rbozivdq gmlhrtvp GRP_0 name:mfeyouli ndobtzpw\nlanguage:\nbrowser:mic... name mfeyouli ndobtzpw language browser micros... en Surname mfeyouli ndobtzpw language browser mic... [surname mfeyouli ndobtzpw language browser mi...
8489 sdvlxbfe ptnahjkw GRP_0 account locked account locked account locked account locked en account locked account locked [history locked history locked, account lock a...
8492 tmopbken ibzougsd GRP_0 hr_tool etime option not visitble hr_tool eti... hr tool etime option not visitble hr tool etim... en hr tool etime option distress visitble hr tool... [hr tool etime option distress visitble hr too...
8496 rbozivdq gmlhrtvp GRP_0 telephony_software issue telephony_software issue telephony software issue telephony software issue en telephony software issue telephony software issue [telephone software issue telephone software i...
8497 oybwdsgx oxyhwrfz GRP_0 vip2: windows password reset for tifpdchb pedx... vip windows password reset for tifpdchb pedxru... en vip windows password reset for tifpdchb pedxru... [vip Windows password reset for tifpdchb pedxr...

3834 rows × 7 columns

In [57]:
s = clean_data4.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
clean_data_aug4 = clean_data4.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.

In [58]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_aug4['Assignment group'].nunique())

# Histogram
clean_data_aug4['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-5)')
Total assignment groups: 1

clean_data_mod4 = clean_data4.drop(['New Description', 'Clean_Description'],axis=1) clean_data_mod4.rename(columns={'Translated Text': 'Final_Text'}, inplace=True) clean_data_mod4.head()

In [59]:
dataframes=[clean_data_aug1,clean_data_aug2,clean_data_aug3,clean_data_aug4]
clean_data_result= pd.concat(dataframes)
In [60]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_result['Assignment group'].nunique())

# Histogram
clean_data_result['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-5)')
Total assignment groups: 62

Stop words removal and Lemmatise text

In [61]:
#Stop words removal
nltk.download('stopwords')
from nltk.corpus import stopwords

sr = stopwords.words('english')
for i,text in enumerate(clean_data_result['Final_Text']):
    clean_data_result['Final_Text'][i]=" ".join(word for word in text.split(' ') if word not in sr)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [62]:
clean_data_result
Out[62]:
Caller Assignment group language Final_Text
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc failed job scheduler jo...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc failed job scheduler jo...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc failed job scheduler jo...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc failed job scheduler jo...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc failed job scheduler jo...
... ... ... ... ...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...

306006 rows × 4 columns

In [63]:
#Lemmatisation using spacy library
!pip install spacy
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple
Requirement already satisfied: spacy in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (2.3.4)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (2.0.5)
Requirement already satisfied: numpy>=1.15.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (1.19.1)
Requirement already satisfied: thinc<7.5.0,>=7.4.1 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (7.4.4)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (4.54.1)
Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (1.0.0)
Requirement already satisfied: setuptools in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (49.6.0.post20200814)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (1.0.5)
Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (0.8.0)
Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (1.1.3)
Requirement already satisfied: blis<0.8.0,>=0.4.0; python_version >= "3.6" in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (0.7.4)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (2.24.0)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (3.0.5)
Requirement already satisfied: srsly<1.1.0,>=1.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy) (1.0.5)
Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (1.7.0)
Requirement already satisfied: idna<3,>=2.5 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.11.8)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.10)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)
Requirement already satisfied: zipp>=0.5 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy) (3.1.0)
In [64]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
Requirement already satisfied (use --upgrade to upgrade): en-core-web-sm==2.3.1 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages
Requirement already satisfied: spacy<2.4.0,>=2.3.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from en-core-web-sm==2.3.1) (2.3.4)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (4.54.1)
Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (0.8.0)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.0.5)
Requirement already satisfied: setuptools in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (49.6.0.post20200814)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (2.24.0)
Requirement already satisfied: numpy>=1.15.0 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.19.1)
Requirement already satisfied: srsly<1.1.0,>=1.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.0.5)
Requirement already satisfied: blis<0.8.0,>=0.4.0; python_version >= "3.6" in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (0.7.4)
Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.0.0)
Requirement already satisfied: thinc<7.5.0,>=7.4.1 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (7.4.4)
Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.1.3)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (3.0.5)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (2.0.5)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.25.10)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (2020.11.8)
Requirement already satisfied: idna<3,>=2.5 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (2.10)
Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (1.7.0)
Requirement already satisfied: zipp>=0.5 in c:\users\aroy\anaconda3\envs\myenv\lib\site-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en-core-web-sm==2.3.1) (3.1.0)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047113 sha256=7a257dfb4c18fbea572d161f063735dc21f335d07d1b9a9647b8a8e5b22d7705
  Stored in directory: c:\users\aroy\appdata\local\pip\cache\wheels\10\6f\a6\ddd8204ceecdedddea923f8514e13afb0c1f0f556d2c9c3da0
Successfully built en-core-web-sm
In [65]:
# Need to run "python -m spacy download en" in anaconda prompt to avoid 'en' not found issue.
In [66]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

clean_data_result['Final_Text'] = clean_data_result['Final_Text'].apply(lemmatize_text)
In [67]:
clean_data_result
Out[67]:
Caller Assignment group language Final_Text
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc fail job scheduler job ...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc fail job scheduler job ...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc fail job scheduler job ...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc fail job scheduler job ...
17 sigfdwcj reofwzlm GRP_3 en job mm zscr dly merktc fail job scheduler job ...
... ... ... ... ...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...
8497 oybwdsgx oxyhwrfz GRP_0 en type outage network circuit power please speci...

306006 rows × 4 columns

Attempt to use Google Translate library

In [68]:
#!pip install goslate
In [69]:
'''# Define and construct the service urls
domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
urls = ['http://translate.google' + domain for domain in domains]'''
Out[69]:
"# Define and construct the service urls\ndomains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']\nurls = ['http://translate.google' + domain for domain in domains]"
In [70]:
"""from goslate import Goslate # Provided by Google
import random

# List of column data to consider for translation
trans_cols = ['Clean_Description']

for idx in range(clean_data.shape[0]):
    # Instantiate Goslate class in each iteration
    gs = Goslate(service_urls=random.choice(urls))
    row_iter = gs.translate(clean_data.loc[idx, trans_cols].tolist(), 
                            target_language='en', 
                            source_language='auto')
    clean_data.loc[idx, trans_cols] = list(row_iter)
    time.sleep(30)
    
clean_data.tail()"""
Out[70]:
"from goslate import Goslate # Provided by Google\nimport random\n\n# List of column data to consider for translation\ntrans_cols = ['Clean_Description']\n\nfor idx in range(clean_data.shape[0]):\n    # Instantiate Goslate class in each iteration\n    gs = Goslate(service_urls=random.choice(urls))\n    row_iter = gs.translate(clean_data.loc[idx, trans_cols].tolist(), \n                            target_language='en', \n                            source_language='auto')\n    clean_data.loc[idx, trans_cols] = list(row_iter)\n    time.sleep(30)\n    \nclean_data.tail()"
In [71]:
# Serialize the translated dataset
clean_data_result.to_csv('Final_data.csv', index=False, encoding='utf_8_sig')
with open('Final_data.pkl','wb') as f:
    pickle.dump(clean_data_result, f, pickle.HIGHEST_PROTOCOL)
In [72]:
# Load the translated pickle file 
with open('final_data.pkl','rb') as f:
    clean_data = pickle.load(f)

Univariate visualization

Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.

The distribution of Assignment groups

Plots how the assignments groups are scattered across the dataset. The bar chart, histogram and pie chart tells the frequency of any ticket assigned to any group OR the tickets count for each group.

In [73]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data['Assignment group'].nunique())

# Histogram
clean_data['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')

# Pie chart
assgn_grp = pd.DataFrame(clean_data.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
    kind='pie', 
    labels='Assignment group', 
    values='Count', 
    title='Assignment Group Distribution- Pie Chart (Fig-2)', 
    hoverinfo="label+percent+name", hole=0.25)
Total assignment groups: 62

Lets visualize the percentage of incidents per assignment group

In [74]:
# Plot to visualize the percentage data distribution across different groups
sns.set(style="whitegrid")
plt.figure(figsize=(20,5))
ax = sns.countplot(x="Assignment group", data=clean_data, order=clean_data["Assignment group"].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for p in ax.patches:
  ax.annotate(str(format(p.get_height()/len(clean_data.index)*100, '.2f')+"%"), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'bottom', rotation=90, xytext = (0, 10), textcoords = 'offset points')
In [75]:
top_20 = clean_data['Assignment group'].value_counts().nlargest(20).reset_index()
In [76]:
plt.figure(figsize=(12,6))
bars = plt.bar(top_20['index'],top_20['Assignment group'])
plt.title('Top 20 Assignment groups with highest number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')

for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()
In [77]:
bottom_20 = clean_data['Assignment group'].value_counts().nsmallest(20).reset_index()
In [78]:
plt.figure(figsize=(12,6))
bars = plt.bar(bottom_20['index'],bottom_20['Assignment group'])
plt.title('Bottom 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

The distribution of Callers

Plots how the callers are associated with tickets and what are the assignment groups they most frequently raise tickets for.

In [79]:
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', clean_data['Caller'].nunique())
df = pd.DataFrame(clean_data.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
df.iplot(kind='pie',
         labels='Caller', 
         values='Count', 
         title='Top 10 caller- Pie Chart (Fig-7)',
         colorscale='-spectral',
         pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])
Total caller count: 2862
In [80]:
# Top 5 callers in each assignment group
top_n = 5
s = clean_data['Caller'].groupby(clean_data['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)
Out[80]:
Caller
Assignment group Caller
GRP_0 fumkcsji sarmtlhy 637
rbozivdq gmlhrtvp 606
efbwiadp dicafxhv 391
yqlvfkih folbpugd 364
qasdhyzm yuglsrwx 297
GRP_1 kbnfxpsy gehxzayq 366
spxqmiry zpwgoqju 276
jloygrwh acvztedi 268
dctvfjrn oypnxftq 244
mnlazfsr mtqrkhnx 228
GRP_10 bpctwhsn kzqsbmtp 1064
dizquolf hlykecxa 544
ihfkwzjd erbxoyqk 498
ikerxqwz prkyuitl 464
ipwjorsc uboapexr 431

The distribution of description lengths

Plots the variation of length and word count of new description attribute

In [81]:
clean_data.insert(1, 'desc_len', clean_data['Final_Text'].astype(str).apply(len))
clean_data.insert(5, 'desc_word_count', clean_data['Final_Text'].apply(lambda x: len(str(x).split())))
clean_data.head()
Out[81]:
Caller desc_len Assignment group language Final_Text desc_word_count
17 sigfdwcj reofwzlm 83 GRP_3 en job mm zscr dly merktc fail job scheduler job ... 16
17 sigfdwcj reofwzlm 83 GRP_3 en job mm zscr dly merktc fail job scheduler job ... 16
17 sigfdwcj reofwzlm 83 GRP_3 en job mm zscr dly merktc fail job scheduler job ... 16
17 sigfdwcj reofwzlm 83 GRP_3 en job mm zscr dly merktc fail job scheduler job ... 16
17 sigfdwcj reofwzlm 83 GRP_3 en job mm zscr dly merktc fail job scheduler job ... 16
In [82]:
# Description text length
clean_data['desc_len'].iplot(
    kind='bar',
    xTitle='text length',
    yTitle='count',
    colorscale='-ylgn',
    title='Description Text Length Distribution (Fig-11)')

# Description word count
clean_data['desc_word_count'].iplot(
    kind='bar',
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='-bupu',
    title='Description Word Count Distribution (Fig-12)')

N-Grams

N-gram is a contiguous sequence of N items from a given sample of text or speech, in the fields of computational linguistics and probability. The items can be phonemes, syllables, letters, words or base pairs according to the application. N-grams are used to describe the number of words used as observation points, e.g., unigram means singly-worded, bigram means 2-worded phrase, and trigram means 3-worded phrase.

We'll be using scikit-learn’s CountVectorizer function to derive n-grams and compare them before and after removing stop words. Stop words are a set of commonly used words in any language. We'll be using english corpus stopwords and extend it to include some business specific common words considered to be stop words in our case.

In [83]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer

# Extend the English Stop Wordss
STOP_WORDS = STOPWORDS.union({'yes','na','hi',
                              'receive','hello',
                              'regards','thanks',
                              'from','greeting',
                              'forward','reply',
                              'will','please',
                              'see','help','able'})

# Generic function to derive top N n-grams from the corpus
def get_top_n_ngrams(corpus, top_n=None, ngram_range=(1,1), stopwords=None):
    vec = CountVectorizer(ngram_range=ngram_range, 
                          stop_words=stopwords).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_n]

Top Unigrams

In [84]:
# Top 50 Unigrams before removing stop words
top_n = 50
ngram_range = (1,1)
uni_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(uni_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Unigrams in Final_Text')

# Top 50 Unigrams after removing stop words
uni_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(uni_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Unigrams in Final_Text without stop words')
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\feature_extraction\text.py:386: UserWarning:

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['aren', 'couldn', 'didn', 'doesn', 'don', 'hadn', 'hasn', 'haven', 'isn', 'let', 'll', 'mustn', 're', 'shan', 'shouldn', 've', 'wasn', 'weren', 'won', 'wouldn'] not in stop_words.

Top Bigrams

In [85]:
# Top 50 Bigrams before removing stop words
top_n = 50
ngram_range = (2,2)
bi_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(bi_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Bigrams in Final_Text')

# Top 50 Bigrams after removing stop words
bi_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(bi_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Bigrams in Final_Text without stop words')

Top Trigrams

In [86]:
# Top 50 Trigrams before removing stop words
top_n = 50
ngram_range = (3,3)
tri_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(tri_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Trigrams in Final_Text')

# Top 50 Trigrams after removing stop words
tri_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(tri_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Trigrams in Final_Text without stop words')

Word Cloud

Let us attempt to visualize this as a word cloud for top three groups that has got maximum records. A word cloud enables us to visualize the data as cluster of words and each words displayed in different font size based on the number of occurences of that word . Basically; the bolder and bigger the word show up in the visualization, it implies its more often it’s mentioned within a given text compared to other words in the cloud and therefore would be more important for us.

Let's write a generic method to generate Word Clouds for both Short and Long Description columns.

In [87]:
def generate_word_cloud(corpus):
        # Instantiate the wordcloud object
    wordcloud = WordCloud(width = 800, height = 800, 
                    background_color ='white', 
                    stopwords=STOP_WORDS,
                    # mask=mask,
                    min_font_size = 10).generate(corpus)

    # plot the WordCloud image                        
    plt.figure(figsize = (12, 12), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 

    plt.show()
In [88]:
# Word Cloud for all tickets assigned to GRP_0
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_0'].Final_Text.str.strip()))
In [89]:
# Word Cloud for all tickets assigned to GRP_8
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_8'].Final_Text.str.strip()))
In [90]:
# Word Cloud for all tickets assigned to GRP_25
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_25'].Final_Text.str.strip()))
In [91]:
# Generate wordcloud for Final_Text field
generate_word_cloud(' '.join(clean_data.Final_Text.str.strip()))

Prepping Dataframe for Model Building

In [92]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
clean_data['Assignment group LabelEncoded']= label_encoder.fit_transform(clean_data['Assignment group']) 
  
clean_data['Assignment group LabelEncoded'].unique()
Out[92]:
array([23, 60,  4,  5, 11, 12,  1, 32, 43, 52, 58, 61,  2,  3,  6,  7,  8,
        9, 10, 13, 15, 17, 19, 21, 22, 24, 25, 18, 27, 28, 33, 34, 35, 38,
       14, 16, 20, 29, 30, 31, 36, 37, 39, 40, 41, 42, 44, 45, 46, 47, 49,
       50, 51, 53, 26, 54, 55, 56, 57, 48, 59,  0])
In [93]:
label_encoded_dict = dict(zip(clean_data['Assignment group'].unique(), clean_data['Assignment group LabelEncoded'].unique()))
len(label_encoded_dict)
Out[93]:
62

Feature Extraction : Bag of Words using CountVectorizer

In [94]:
from sklearn.feature_extraction.text import CountVectorizer

CV = CountVectorizer()

X_BoW = CV.fit_transform(clean_data['Final_Text']).toarray()
y = clean_data['Assignment group LabelEncoded']

print("Shape of Input Feature :",np.shape(X_BoW))
print("Shape of Target Feature :",np.shape(y))
Shape of Input Feature : (306006, 79)
Shape of Target Feature : (306006,)
In [95]:
# Splitting Train Test 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_BoW, y, test_size=0.3, random_state = 1)
print('\033[1mShape of the training set:\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the test set:\033[0m', y_train.shape, y_test.shape)
Shape of the training set: (214204, 79) (91802, 79)
Shape of the test set: (214204,) (91802,)
In [96]:
def run_classification(estimator, X_train, X_test, y_train, y_test, arch_name=None, pipelineRequired=True, isDeepModel=False):
    # train the model
    clf = estimator

    if pipelineRequired :
        clf = Pipeline([('tfidf', TfidfTransformer()),
                     ('clf', estimator),
                     ])
      
    if isDeepModel :
        clf.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=10, batch_size=128,verbose=1,callbacks=call_backs(arch_name))
        # predict from the clasiffier
        y_pred = clf.predict(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_train_pred = clf.predict(X_train)
        y_train_pred = np.argmax(y_train_pred, axis=1)
    else :
        clf.fit(X_train, y_train)
        # predict from the clasiffier
        y_pred = clf.predict(X_test)
        y_train_pred = clf.predict(X_train)
    
    print('Estimator:', clf)
    print('='*80)
    print('Training accuracy: %.2f%%' % (accuracy_score(y_train,y_train_pred) * 100))
    print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('='*80)
    print('Confusion matrix:\n %s' % (confusion_matrix(y_test, y_pred)))
    print('='*80)
    print('Classification report:\n %s' % (classification_report(y_test, y_pred)))

Logistic Regression

In [98]:
run_classification(LogisticRegression(), X_train, X_test, y_train, y_test)
C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\linear_model\_logistic.py:764: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])
================================================================================
Training accuracy: 15.53%
Testing accuracy: 15.40%
================================================================================
Confusion matrix:
 [[14008     0     0 ...     0     0     0]
 [  756     0     0 ...     0     0     0]
 [ 2805     0     0 ...     0     0     0]
 ...
 [   19     0     0 ...     0     0     0]
 [ 3951     0     0 ...     0     0     0]
 [ 3254     0     0 ...     0     0     0]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.15      1.00      0.27     14046
           1       0.00      0.00      0.00       756
           2       0.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.98      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.00      0.00      0.00      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.00      0.00      0.00      3254

    accuracy                           0.15     91802
   macro avg       0.03      0.02      0.01     91802
weighted avg       0.07      0.15      0.04     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

Naive Bayes Classifier

In [99]:
run_classification(MultinomialNB(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
================================================================================
Training accuracy: 15.48%
Testing accuracy: 15.37%
================================================================================
Confusion matrix:
 [[13946    22     0 ...    17     0     0]
 [  729    27     0 ...     0     0     0]
 [ 2800     0     5 ...     0     0     0]
 ...
 [   19     0     0 ...     0     0     0]
 [ 3950     0     0 ...     0     0     1]
 [ 3251     0     0 ...     0     0     3]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.15      0.99      0.26     14046
           1       0.55      0.04      0.07       756
           2       1.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.69      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.00      0.00      0.00      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.75      0.00      0.00      3254

    accuracy                           0.15     91802
   macro avg       0.06      0.02      0.01     91802
weighted avg       0.12      0.15      0.04     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

K-nearest Neighbor

In [100]:
run_classification(KNeighborsClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()), ('clf', KNeighborsClassifier())])
================================================================================
Training accuracy: 2.04%
Testing accuracy: 2.06%
================================================================================
Confusion matrix:
 [[89  0  0 ...  0  0  0]
 [27  0  0 ...  0  0  0]
 [ 0  0  5 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  1]
 [ 0  0  0 ...  0  0  3]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.73      0.01      0.01     14046
           1       0.00      0.00      0.00       756
           2       1.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.98      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.02      1.00      0.04      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.75      0.00      0.00      3254

    accuracy                           0.02     91802
   macro avg       0.07      0.02      0.00     91802
weighted avg       0.22      0.02      0.01     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

Support Vector Machine (SVM)

In [101]:
run_classification(LinearSVC(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()), ('clf', LinearSVC())])
================================================================================
Training accuracy: 15.54%
Testing accuracy: 15.41%
================================================================================
Confusion matrix:
 [[14008     0     0 ...     0     0     0]
 [  756     0     0 ...     0     0     0]
 [ 2800     0     5 ...     0     0     0]
 ...
 [   19     0     0 ...     0     0     0]
 [ 3950     0     0 ...     0     0     1]
 [ 3251     0     0 ...     0     0     3]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.15      1.00      0.27     14046
           1       0.00      0.00      0.00       756
           2       1.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.98      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.00      0.00      0.00      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.75      0.00      0.00      3254

    accuracy                           0.15     91802
   macro avg       0.06      0.02      0.01     91802
weighted avg       0.13      0.15      0.04     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

Decision Tree

In [102]:
run_classification(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf', DecisionTreeClassifier())])
================================================================================
Training accuracy: 15.54%
Testing accuracy: 15.41%
================================================================================
Confusion matrix:
 [[14008     0     0 ...     0     0     0]
 [  756     0     0 ...     0     0     0]
 [ 2800     0     5 ...     0     0     0]
 ...
 [   19     0     0 ...     0     0     0]
 [ 3950     0     0 ...     0     0     1]
 [ 3251     0     0 ...     0     0     3]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.15      1.00      0.27     14046
           1       0.00      0.00      0.00       756
           2       1.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.98      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.00      0.00      0.00      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.75      0.00      0.00      3254

    accuracy                           0.15     91802
   macro avg       0.06      0.02      0.01     91802
weighted avg       0.13      0.15      0.04     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

Random Forest

In [103]:
run_classification(RandomForestClassifier(n_estimators=100), X_train, X_test, y_train, y_test)
Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])
================================================================================
Training accuracy: 15.54%
Testing accuracy: 15.41%
================================================================================
Confusion matrix:
 [[14008     0     0 ...     0     0     0]
 [  756     0     0 ...     0     0     0]
 [ 2800     0     5 ...     0     0     0]
 ...
 [   19     0     0 ...     0     0     0]
 [ 3950     0     0 ...     0     0     1]
 [ 3251     0     0 ...     0     0     3]]
================================================================================
Classification report:
               precision    recall  f1-score   support

           0       0.15      1.00      0.27     14046
           1       0.00      0.00      0.00       756
           2       1.00      0.00      0.00      2805
           3       0.00      0.00      0.00      1150
           4       0.00      0.00      0.00      1562
           5       0.00      0.00      0.00      1063
           6       0.00      0.00      0.00      3979
           7       0.00      0.00      0.00      1223
           8       0.00      0.00      0.00      2814
           9       0.00      0.00      0.00       416
          10       0.00      0.00      0.00      3638
          11       0.00      0.00      0.00      1068
          12       0.00      0.00      0.00      2900
          13       0.00      0.00      0.00      1541
          14       0.00      0.00      0.00      1612
          15       0.00      0.00      0.00       797
          16       0.00      0.00      0.00      1490
          17       0.00      0.00      0.00      1527
          18       0.00      0.00      0.00      3377
          19       0.00      0.00      0.00      2373
          20       0.00      0.00      0.00       478
          21       0.00      0.00      0.00       970
          22       0.00      0.00      0.00      2943
          23       0.00      0.00      0.00      1205
          24       0.00      0.00      0.00       446
          25       0.00      0.00      0.00       758
          26       0.00      0.00      0.00        81
          27       0.00      0.00      0.00      1969
          28       0.00      0.00      0.00      1041
          29       0.00      0.00      0.00       224
          30       0.00      0.00      0.00       672
          31       0.00      0.00      0.00      1107
          32       0.98      0.02      0.03      3513
          33       0.00      0.00      0.00      1458
          34       0.00      0.00      0.00      1663
          35       0.00      0.00      0.00       463
          36       0.00      0.00      0.00       277
          37       0.00      0.00      0.00       905
          38       0.00      0.00      0.00      1087
          39       0.00      0.00      0.00       157
          40       0.00      0.00      0.00       689
          41       0.00      0.00      0.00       281
          42       0.00      0.00      0.00       242
          43       0.65      0.05      0.08      1743
          44       0.00      0.00      0.00       579
          45       0.00      0.00      0.00       416
          46       0.00      0.00      0.00       333
          47       0.00      0.00      0.00       400
          48       0.00      0.00      0.00        24
          49       0.00      0.00      0.00       601
          50       0.00      0.00      0.00        42
          51       0.00      0.00      0.00       378
          52       0.00      0.00      0.00      2455
          53       0.00      0.00      0.00       402
          54       0.00      0.00      0.00      2334
          55       0.00      0.00      0.00       407
          56       0.00      0.00      0.00        75
          57       0.00      0.00      0.00        50
          58       0.00      0.00      0.00      1573
          59       0.00      0.00      0.00        19
          60       0.00      0.00      0.00      3951
          61       0.75      0.00      0.00      3254

    accuracy                           0.15     91802
   macro avg       0.06      0.02      0.01     91802
weighted avg       0.13      0.15      0.04     91802

C:\Users\aroy\Anaconda3\envs\myenv\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [ ]:
 
In [ ]: